! Copyright (C) 2001-2009 Quantum ESPRESSO group
! Copyright (C) 2009 Brian Kolb, Timo Thonhauser - Wake Forest University
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
! ----------------------------------------------------------------------


PROGRAM GENERATE_KERNEL_


! This is a stand-alone program to generate the file "vdW_kernel_table"
! needed for a van der Waals run. There should be no need, in general,
! to use this program as the default kernel file supplied with the
! distribution should suffice for most cases. However, if that file is
! insufficient for a particular purpose, a more suitable kernel file can
! be generated by running this program.
!
!
! The kernel itself is defined in
!
!    M. Dion, H. Rydberg, E. Schroeder, D. C. Langreth, and
!    B. I. Lundqvist, Phys. Rev. Lett. 92, 246401 (2004).
!
! henceforth referred to as DION. Further information about the
! functional and its corresponding potential can be found in:
!
!    T. Thonhauser, V.R. Cooper, S. Li, A. Puzder, P. Hyldgaard,
!    and D.C. Langreth, Phys. Rev. B 76, 125112 (2007).
!
! The proper spin extension of vdW-DF, i.e. svdW-DF, uses the same
! kernel and is derived in
!
!    T. Thonhauser et al., TBD
!
! henceforth refered to THONHAUSER.
!
!
! Two review article show many of the vdW-DF applications:
!
!    D. C. Langreth et al., J. Phys.: Condens. Matter 21, 084203 (2009).
!
!    K. Berland et al, Rep. Prog. Phys. 78, 066501 (2015).
!
!
! The method implemented is based on the method of G. Roman-Perez and
! J. M. Soler described in:
!
!    G. Roman-Perez and J. M. Soler, PRL 103, 096102 (2009)
!
! henceforth referred to as SOLER.
!
!
! The original definition of the kernel function is given in DION
! equations 14-16. The Soler method makes the kernel function a function
! of only 1 variable (r) by first putting it in the form phi(q1*r,
! q2*r). Then, the q-dependence is removed by expanding the function in
! a special way (see SOLER equation 3). This yields a separate function
! for each pair of q points that is a function of r alone. There are
! (N^2+N)/2 unique functions, where N is the number of q points used. In
! the Soler method, the kernel is first made in the form phi(d1, d2) but
! this is not done here. It was found that, with q's chosen judiciously
! ahead of time, the kernel and the second derivatives required for
! interpolation could be tabulated ahead of time for faster use of the
! vdW_FD functional. Through testing we found no need to soften the
! kernel and correct for this later (see SOLER eqations 6-7).
!
! The algorithm employed here is "embarrassingly parallel," meaning that
! it parallelizes very well up to (N^2+N)/2 processors, where, again, N
! is the number of q points chosen. However, parallelization on this
! scale is unnecessary. In testing the code runs in under a minute on 16
! Intel Xeon processors.
!
! IMPORTANT NOTICE: results are very sensitive to compilation details.
! In particular, the usage of FMA (Fused Multiply-and-Add) instructions
! used by modern CPUs such as AMD Interlagos (Bulldozer) and Intel Ivy
! Bridge may affect quite heavily some components of the kernel table
! (communication by Ake Sandberg, Umea University). In practice this
! should not be a problem, since most affected elements are the less
! relevant ones.
!
! Some of the algorithms here are somewhat modified versions of those
! found in the book:
!
!    Numerical Recipes in C; William H. Press, Brian P. Flannery, Saul A.
!    Teukolsky, and William T. Vetterling.  Cambridge University Press (1988).
!
! hereafter referred to as NUMERICAL_RECIPES. The routines were
! translated to Fortran, of course and variable names are generally
! different.
!
!
! For the calculation of the kernel we have benefited from access to
! earlier vdW-DF implementation into PWscf and ABINIT, written by Timo
! Thonhauser, Valentino Cooper, and David Langreth. These codes, in
! turn, benefited from earlier codes written by Maxime Dion and Henrik
! Rydberg.


use mp_global,            ONLY : mp_startup, mp_global_end
use mp_world,             ONLY : world_comm

implicit none  

call mp_startup ()
call generate_kernel ( world_comm )
call mp_global_end( )

END PROGRAM GENERATE_KERNEL_








! ######################################################################
!                           |                 |
!                           | GENERATE_KERNEL |
!                           |_________________|
!
SUBROUTINE generate_kernel ( my_comm )

use mp,                   ONLY : mp_get, mp_barrier, mp_size, mp_rank
use kinds,                ONLY : dp
use io_global,            ONLY : ionode
use constants,            ONLY : pi

implicit none

integer, intent (in) :: my_comm




! ----------------------------------------------------------------------
! User set-able parameters.

integer, parameter :: Nr_points = 1024   ! The number of radial points (also the number of k points)
                                         ! used in the formation of the kernel functions for each
                                         ! pair of q values. Increasing this value will help in
                                         ! case you get a run-time error saying that you are trying
                                         ! to use a k value that is larger than the largest
                                         ! tabulated k point since the largest k point will be
                                         ! 2*pi/r_max * Nr_points. Memory usage of the vdW_DF piece
                                         ! of PWSCF will increase roughly linearly with this
                                         ! variable.

real(dp), parameter :: r_max = 100.0D0   ! The value of the maximum radius to use for the real-space
                                         ! kernel functions for each pair of q values. The larger
                                         ! this value is the smaller the smallest k value will be
                                         ! since the smallest k point value is 2*pi/r_max. Be
                                         ! careful though, since this will also decrease the maximum
                                         ! k point value and the vdW_DF code will crash if it
                                         ! encounters a g-vector with a magnitude greater than
                                         ! 2*pi/r_max *Nr_points


! ----------------------------------------------------------------------
! Integration parameters for the kernel. These are based on DION.
! Changing these MAY make the kernel more accurate. They will not
! affect the run time or memory usage of the vdW-DF code.

integer, parameter :: Nintegration_points = 256   ! Number of integration points for real-space
                                                  ! kernel generation (see DION equation 14).
                                                  ! This is how many a's and b's there will be.

real(dp), parameter :: a_min = 0.0D0              ! Starting value for the a and b integration
                                                  ! in DION equation 14.

real(dp), parameter :: a_max = 64.0D0             ! Maximum value for the a and b integration
                                                  ! in DION equation 14.

CHARACTER(LEN=30) :: double_format = "(1p4e23.14)"


! ----------------------------------------------------------------------
! The next 2 parameters define the q mesh to be used in the vdW_DF code.
! These are perhaps the most important to have set correctly. Increasing
! the number of q points will DRAMATICALLY increase the memory usage of
! the vdW_DF code because the memory consumption depends quadratically
! on the number of q points in the mesh. Increasing the number of q
! points may increase accuracy of the vdW_DF code, although, in testing
! it was found to have little effect. The largest value of the q mesh
! is q_cut. All values of q0 (DION equation 11) larger than this value
! during a run will be saturated to this value using equation 5 of
! SOLER. In testing, increasing the value of q_cut was found to have
! little impact on the results, though it is possible that in some
! systems it may be more important. Always make sure that the variable
! Nqs is consistent with the number of q points that are actually in the
! variable q_mesh. Also, do not set any q value to 0. This will cause an
! infinity in the Fourier transform.
!
!
!                CHANGE THESE VALUES AT YOUR OWN RISK

integer, parameter :: Nqs = 20

real(dp), dimension(Nqs):: q_mesh = (/  &
    1.0D-5             , 0.0449420825586261D0, 0.0975593700991365D0, 0.159162633466142D0, &
    0.231286496836006D0, 0.315727667369529D0 , 0.414589693721418D0 , 0.530335368404141D0, &
    0.665848079422965D0, 0.824503639537924D0 , 1.010254382520950D0 , 1.227727621364570D0, &
    1.482340921174910D0, 1.780437058359530D0 , 2.129442028133640D0 , 2.538050036534580D0, &
    3.016440085356680D0, 3.576529545442460D0 , 4.232271035198720D0 , 5.0D0 /)


! ----------------------------------------------------------------------
! The following are a few suggested sets of parameters that may be
! useful in some systems. Again, only change the default values if 1)
! you know what you're doing and 2) the default values are insufficient
! (or suspected to be insufficient) for your particular system. Use
! these sets by commenting out the definition of Nqs and q_mesh above
! and uncommenting 1 of the desired sets below. You may also make your
! own set if you know what you're doing.


! ----------------------------------------------------------------------
! Uncomment to use a q_mesh of 25 points with a cutoff of 5.
!integer, parameter :: Nqs = 25
!real(dp), dimension(Nqs) :: q_mesh = (/ 1.0D-5, 0.0319324863726618D0, 0.0683071727114252D0, &
!     0.109742023439998D0, 0.156940969402303D0, 0.210705866844455D0, &
!     0.271950120037604D0, 0.341714198974465D0, 0.421183315767499D0, &
!     0.511707560050586D0, 0.614824835461683D0, 0.732286986871156D0, &
!     0.866089562227575D0, 1.01850571464079D0, 1.19212482065999D0, &
!     1.38989647082725D0, 1.61518057985587D0, 1.87180446774829D0, &
!     2.16412788159658D0, 2.49711706271187D0, 2.87642911739861D0, &
!     3.30850812473687D0, 3.80069461413434D0, 4.36135027254676D0, &
!     5.0D0 /)


! ----------------------------------------------------------------------
! Uncomment to use a q_mesh of 30 points with a cutoff of 5.
! integer, parameter :: Nqs = 30
! real(dp), dimension(Nqs) :: q_mesh = (/ 1.0D-5, 0.026559672691443D0, 0.0561185595841672D0, &
!      0.08901534278204D0, 0.125626949595767D0, 0.166372871329829D0, &
!      0.211719969762446D0, 0.262187826390619D0, 0.318354695731256D0, &
!      0.380864130890569D0, 0.4504323573167D0, 0.527856479223139D0, &
!      0.61402361271113D0, 0.709921050237249D0, 0.816647572889386D0, &
!      0.935426040085808D0, 1.06761740094853D0, 1.21473628789156D0, &
!      1.37846837109353D0, 1.56068967270003D0, 1.76348806205544D0, &
!      1.98918717825406D0, 2.24037305411209D0, 2.51992374661476D0, &
!      2.83104231334061D0, 3.17729351270267D0, 3.56264464851356D0, &
!      3.99151102686645D0, 4.46880654617114D0, 5.0D0 &
!      /)


! ----------------------------------------------------------------------
! Uncomment to use a q_mesh of 30 poits with a cutoff of 8
! integer, parameter :: Nqs = 30
! real(dp), dimension(Nqs) :: q_mesh = (/ 1.0D-5, 0.0424954763063088D0, 0.0897896953346675D0, &
!      0.142424548451264D0, 0.201003119353227D0, 0.266196594127727D0, &
!      0.338751951619913D0, 0.41950052222499D0, 0.50936751317001D0, &
!      0.609382609424911D0, 0.720691771706719D0, 0.844570366757022D0, &
!      0.982437780337809D0, 1.1358736803796D0, 1.30663611662302D0, &
!      1.49668166413729D0, 1.70818784151764D0, 1.94357806062649D0, &
!      2.20554939374965D0, 2.49710347632005D0, 2.82158089928871D0, &
!      3.18269948520649D0, 3.58459688657934D0, 4.03187799458362D0, &
!      4.52966770134498D0, 5.08366962032427D0, 5.7002314376217D0, &
!      6.38641764298631D0, 7.15009047387383D0, 8.0D0&
!      /)








! ######################################################################
! ############## DO NOT CHANGE ANYTHING BELOW THIS LINE ################
! ######################################################################

integer :: a_i, b_i, q1_i, q2_i, r_i, count                         ! Indexing variables.

real(dp) :: weights(Nintegration_points)                            ! Array to hold dx values for
                                                                    ! the Gaussian-Legendre
                                                                    ! integration of the kernel.

real(dp) :: nu(Nintegration_points), nu1(Nintegration_points)       ! Defined in the discussion
                                                                    ! below equation 16 of DION.

real(dp) :: a(Nintegration_points), a2(Nintegration_points)         ! The values of the points a
                                                                    ! (DION equation 14) and a^2.

real(dp) :: sin_a(Nintegration_points), cos_a(Nintegration_points)  ! Sine and cosine values of the
                                                                    ! aforementioned points a.

real(dp) :: W_ab(Nintegration_points, Nintegration_points)          ! Defined in DION equation 16.

real(dp) :: dr, d1, d2, d, w, x, y, z, T, integral                  ! Intermediate values.

real(dp) :: gamma = 4.0D0*pi/9.0D0                                  ! Multiplicative factor for
                                                                    ! exponent in the functions
                                                                    ! called "h" in DION.

real(dp), parameter :: small = 1.0D-15                              ! Number at which to employ
                                                                    ! special algorithms to avoid
                                                                    ! numerical problems. This is
                                                                    ! probably not needed but I like
                                                                    ! to be careful.


! ----------------------------------------------------------------------
! The following sets up a parallel run.

integer :: my_start_q, my_end_q, Ntotal                  ! Starting and ending q value for each
                                                         ! processor, also the total number of !
                                                         ! calculations to do ( (Nqs^2 + Nqs)/2 ).

real(dp), allocatable :: phi(:,:), d2phi_dk2(:,:)        ! Arrays to store the kernel functions and
                                                         ! their second derivatives. They are
                                                         ! stored as phi(radial_point, idx).

integer, allocatable :: indices(:,:), proc_indices(:,:)  ! Indices holds the values of q1 and q2 as
                                                         ! partitioned out to the processors. It is
                                                         ! an Ntotal x 2 array stored as
                                                         ! indices(index of point number, q1:q2).
                                                         ! Proc_indices holds the section of the
                                                         ! indices array that is assigned to each
                                                         ! processor. This is a Nprocs x 2 array,
                                                         ! stored as proc_indices(processor number,
                                                         ! starting_index:ending_index)

integer :: Nper, Nextra, start_q, end_q                  ! Baseline number of jobs per processor,
                                                         ! number of processors that get an extra
                                                         ! job in case the number of jobs doesn't
                                                         ! split evenly over the number of
                                                         ! processors, starting index into the
                                                         ! indices array, ending index into the
                                                         ! indices array.

integer :: nproc, mpime                                  ! Number or procs, rank of current
                                                         ! processor

integer :: idx, proc_i, kernel_file, my_Nqs




! ----------------------------------------------------------------------
! The total number of phi_alpha_beta functions that have to be
! calculated.

Ntotal = (Nqs**2 + Nqs)/2

allocate( indices(Ntotal, 2) )

count = 1


! ----------------------------------------------------------------------
! This part fills in the indices array. It just loops through the q1 and
! q2 values and stores them. Sections of this array will be assigned to
! each of the processors later.

do q1_i = 1, Nqs
   do q2_i = 1, q1_i

      indices(count, 1) = q1_i
      indices(count, 2) = q2_i

      count = count + 1

   end do
end do


! ----------------------------------------------------------------------
! Figure out the baseline number of functions to be calculated by each
! processor and how many processors get 1 extra job.

nproc  = mp_size(my_comm)
mpime  = mp_rank(my_comm)
Nper   = Ntotal/nproc
Nextra = mod(Ntotal, nproc)

allocate(proc_indices(nproc,2) )

start_q = 0
end_q = 0


! ----------------------------------------------------------------------
! Loop over all the processors and figure out which section of the
! indices array each processor should do. All processors figure this
! out for every processor so there is no need to communicate results.

do proc_i = 1, nproc

   start_q = end_q + 1
   end_q = start_q + (Nper - 1)

   if (proc_i <= Nextra) end_q = end_q + 1

   if (proc_i == (mpime+1)) then

      my_start_q = start_q
      my_end_q = end_q

   end if

   proc_indices(proc_i, 1) = start_q
   proc_indices(proc_i, 2) = end_q

end do


! ----------------------------------------------------------------------
! Store how many jobs are assigned to me.

my_Nqs = my_end_q - my_start_q + 1

allocate( phi(0:Nr_points, my_Nqs), d2phi_dk2(0:Nr_points, my_Nqs) )

phi = 0.0D0
d2phi_dk2 = 0.0D0

dr = (r_max)/(Nr_points)


! ----------------------------------------------------------------------
! Find the integration points we are going to use in the
! Gaussian-Legendre integration.

call prep_gaussian_quadrature(a_min, a_max, a, weights, Nintegration_points)


! ----------------------------------------------------------------------
! Get a, a^2, sin(a), cos(a) and the weights for the Gaussian-Legendre
! integration.

do a_i=1, Nintegration_points

   a(a_i) = tan(a(a_i))
   a2(a_i) = a(a_i)**2
   weights(a_i) = weights(a_i)*(1+a2(a_i))
   cos_a(a_i) = cos(a(a_i))
   sin_a(a_i) = sin(a(a_i))

end do


! ----------------------------------------------------------------------
! Calculate the value of the W function defined in DION equation 16 for
! each value of a and b.

do a_i = 1, Nintegration_points
   do b_i = 1, Nintegration_points

      W_ab(a_i, b_i) = 2.0D0 * weights(a_i)*weights(b_i) * ( &
           (3.0D0-a2(a_i))*a(b_i)*cos_a(b_i)*sin_a(a_i)  +   &
           (3.0D0-a2(b_i))*a(a_i)*cos_a(a_i)*sin_a(b_i)  +   &
           (a2(a_i)+a2(b_i)-3.0D0)*sin_a(a_i)*sin_a(b_i) -   &
           3.0D0*a(a_i)*a(b_i)*cos_a(a_i)*cos_a(b_i) )   /   &
           (a(a_i)*a(b_i))

   enddo
enddo


! ----------------------------------------------------------------------
! Now, we loop over all the pairs q1,q2 that are assigned to us and
! perform our calculations.

do idx = 1, my_Nqs

   ! -------------------------------------------------------------------
   ! First, get the value of phi(q1*r, q2*r) for each r and the
   ! particular values of q1 and q2 we are using.

   do r_i = 1, Nr_points

      d1 = q_mesh(indices(idx+my_start_q-1, 1)) * (dr * r_i)
      d2 = q_mesh(indices(idx+my_start_q-1, 2)) * (dr * r_i)

      phi(r_i, idx) = phi_value(d1, d2)

   end do


   ! -------------------------------------------------------------------
   ! Now, perform a radial FFT to turn our phi_alpha_beta(r) into
   ! phi_alpha_beta(k) needed for SOLER equation 8.

   call radial_fft( phi(:,idx) )


   ! -------------------------------------------------------------------
   ! Determine the spline interpolation coefficients for the Fourier
   ! transformed kernel function.

   call set_up_splines( phi(:, idx), d2phi_dk2(:, idx) )

end do


! ----------------------------------------------------------------------
! Finally, we write out the results, after letting everybody catch up.

call mp_barrier( my_comm )
call write_kernel_table_file(phi, d2phi_dk2)


deallocate( phi, d2phi_dk2, indices, proc_indices )


CONTAINS




  ! ####################################################################
  !                          |                  |
  !                          |  SET UP SPLINES  |
  !                          |__________________|
  !
  ! This subroutine accepts a function (phi) and finds at each point the
  ! second derivative (D2) for use with spline interpolation. This
  ! function assumes we are using the expansion described in SOLER
  ! equation 3. That is, the derivatives are those needed to
  ! interpolate Kronecker delta functions at each of the q values. Other
  ! than some special modification to speed up the algorithm in our
  ! particular case, this algorithm is taken directly from
  ! NUMERICAL_RECIPES pages 96-97.

  subroutine set_up_splines(phi, D2)

  real(dp), intent(in) :: phi(0:Nr_points)    ! The k-space kernel function for a particular
                                              ! q1 and q2.

  real(dp), intent(inout) :: D2(0:Nr_points)  ! The second derivatives to be used in the
                                              ! interpolation expansion (SOLER equation 3).

  real(dp), save :: dk = 2.0D0*pi/r_max       ! Spacing of k points.

  real(dp), allocatable :: temp_array(:)      ! Temporary storage.
  real(dp) :: temp_1, temp_2




  allocate( temp_array(0:Nr_points) )

  D2 = 0

  temp_array = 0

  do r_i = 1, Nr_points - 1

     temp_1 = dble(r_i - (r_i - 1))/dble( (r_i + 1) - (r_i - 1) )
     temp_2 = temp_1 * D2(r_i-1) + 2.0D0

     D2(r_i) = (temp_1 - 1.0D0)/temp_2

     temp_array(r_i) = ( phi(r_i+1) - phi(r_i))/dble( dk*((r_i+1) - r_i) ) - &
                       ( phi(r_i) - phi(r_i-1))/dble( dk*(r_i - (r_i-1)) )
     temp_array(r_i) = (6.0D0*temp_array(r_i)/dble( dk*((r_i+1) - (r_i-1)) ) - &
                       temp_1*temp_array(r_i-1))/temp_2

  end do

  D2(Nr_points) = 0.0D0

  do  r_i = Nr_points-1, 0, -1

     D2(r_i) = D2(r_i)*D2(r_i+1) + temp_array(r_i)

  end do

  deallocate( temp_array )

  end subroutine set_up_splines








  ! ####################################################################
  !                            |             |
  !                            |  PHI_VALUE  |
  !                            |_____________|
  !
  ! This function returns the value of the kernel calculated via DION
  ! equation 14.


  real(dp) function phi_value(d1, d2)

  real(dp), intent(in) :: d1, d2       ! The point at which to evaluate the kernel.
                                       ! Note that d1 = q1*r and d2 = q2*r.




  phi_value = 0.0D0

  if (d1==0 .and. d2==0) then

     phi_value = 0.0
     return

  end if


  ! --------------------------------------------------------------------
  ! Loop over all integration points and calculate the value of the nu
  ! functions defined in the discussion below equation 16 in DION. There
  ! are a number of checks here to ensure that we don't run into
  ! numerical problems for very small d values. They are probably
  ! unnecessary but I wanted to be careful.

  do a_i = 1, Nintegration_points

     if ( a(a_i) <= small .and. d1 > small) then

        nu(a_i) = 9.0D0/8.0D0*d1**2/pi

     else if (d1 <= small) then

        nu(a_i) = a(a_i)**2/2.0D0

     else

        nu(a_i) = a(a_i)**2/((-exp(-(a(a_i)**2*gamma)/d1**2) + 1.0D0)*2.0D0)

     end if


     if ( a(a_i) <= small .and. d2 > small) then

        nu1(a_i) = 9.0D0/8.0D0*d2**2/pi

     else if (d2 < small) then

        nu1(a_i) = a(a_i)**2/2.0D0

     else

        nu1(a_i) = a(a_i)**2/((-exp(-(a(a_i)**2*gamma)/d2**2) + 1.0D0)*2.0D0)

     end if

  end do


  ! --------------------------------------------------------------------
  ! Carry out the integration of DION equation 13.

  do a_i = 1, Nintegration_points
     do b_i = 1, Nintegration_points

        w = nu(a_i)
        x = nu(b_i)
        y = nu1(a_i)
        z = nu1(b_i)

        ! --------------------------------------------------------------
        ! Again, watch out for possible numerical problems

        if (w < small .or. x<small .or. y<small .or. z<small) cycle

        T = (1.0D0/(w+x) + 1.0D0/(y+z))*(1.0D0/((w+y)*(x+z)) + 1.0D0/((w+z)*(y+x)))

        phi_value = phi_value + T * W_ab(a_i, b_i)

     end do
  end do

  phi_value = 1.0D0/pi**2*phi_value

  end function phi_value








  ! ####################################################################
  !                    |                            |
  !                    |  PREP_GAUSSIAN_QUADRATURE  |
  !                    |____________________________|
  !
  !
  ! Routine to calculate the points and weights for the
  ! Gaussian-Legendre integration. This routine is modeled after the
  ! routine GAULEG from NUMERICAL_RECIPES page 136.


  subroutine prep_gaussian_quadrature(a_min, a_max, a, weights, Nintegration_points)

  real(dp), intent(in) :: a_min, a_max            ! The starting and ending points for the
                                                  ! integration over a (see DION equation 14).

  real(dp), intent(inout) :: a(:), weights(:)     ! The points and weights for the
                                                  ! Gaussian-Legendre integration.

  integer, intent(in) :: Nintegration_points      ! The number of integration points desired.

  integer :: Npoints                              ! The number of points we actually have to
                                                  ! calculate. The rest will be obtained from
                                                  ! symmetry.

  real(dp) :: poly_1, poly_2, poly_3              ! Temporary storage for Legendre Polynomials.

  integer :: i_point, i_poly                      ! Indexing variables.

  real(dp) :: root, dp_dx, last_root              ! The value of the root of a given Legendre
                                                  ! polynomial, the derivative of the polynomial at
                                                  ! that root and the value of the root in the last
                                                  ! iteration (to check for convergence of Newton's
                                                  ! method).

  real(dp) :: midpoint, length                    ! The middle of the x-range and the length to
                                                  ! that point.




  Npoints  = (Nintegration_points + 1)/2
  midpoint = 0.5D0 * ( atan(a_min) + atan(a_max) )
  length   = 0.5D0 * ( atan(a_max) - atan(a_min) )

  do i_point = 1, Npoints


     ! -----------------------------------------------------------------
     ! Make an initial guess for the root.

     root = cos(dble(pi*(i_point - 0.25D0)/(Nintegration_points + 0.5D0)))

     do

        ! --------------------------------------------------------------
        ! Use the recurrence relations to find the desired polynomial,
        ! evaluated at the approximate root. See NUMERICAL_RECIPES page
        ! 134.

        poly_1 = 1.0D0
        poly_2 = 0.0D0

        do i_poly = 1, Nintegration_points

           poly_3 = poly_2
           poly_2 = poly_1
           poly_1 = ((2.0D0 * i_poly - 1.0D0)*root*poly_2 - (i_poly-1.0D0)*poly_3)/i_poly

        end do


        ! --------------------------------------------------------------
        ! Use the recurrence relations to find the desired polynomial.
        ! Find the derivative of the polynomial and use it in Newton's
        ! method to refine our guess for the root.

        dp_dx = Nintegration_points * (root*poly_1 - poly_2)/(root**2 - 1.0D0)

        last_root = root
        root = last_root - poly_1/dp_dx


        ! --------------------------------------------------------------
        ! Check for convergence.

        if (abs(root - last_root) <= 1.0D-14) then

           exit

        end if

     end do


     ! -----------------------------------------------------------------
     ! Fill in the array of evaluation points.

     a(i_point) = midpoint - length*root
     a(Nintegration_points + 1 - i_point) = midpoint + length*root


     ! -----------------------------------------------------------------
     ! Fill in the array of weights.

     weights(i_point) = 2.0D0 * length/((1.0D0 - root**2)*dp_dx**2)
     weights(Nintegration_points + 1 - i_point) = weights(i_point)

  end do

  end subroutine prep_gaussian_quadrature








  ! ####################################################################
  !                       |                           |
  !                       |  WRITE_KERNEL_TABLE_FILE  |
  !                       |___________________________|
  !
  ! Subroutine to write out the vdW_kernel_table file. All processors
  ! pass their data to processor 0 which is the one that actually does
  ! the writing. This is the only communication in the entire program.

  subroutine write_kernel_table_file(phi, d2phi_dk2)

  real(dp), target :: phi(:,:), d2phi_dk2(:,:)  ! Each processor passes in its array of kernel
                                                ! values and second derivative values for the
                                                ! q-pairs it calculated. They are stored as
                                                ! phi(index of function, function_values).

  integer :: proc_Nqs                           ! Number of calculated functions for a
                                                ! particular processor.

  real(dp), pointer :: data(:,:)                ! Pointer to point to the needed section of the
                                                ! phi and d2phi_dk2 arrays. This is needed because
                                                ! some processors may have calculated 1 extra
                                                ! function if the number of processors is not an
                                                ! even divisor of (Nqs^2+Nqs)/2. Processor 0 is
                                                ! guaranteed to be one of the ones with an extra
                                                ! calculation (if there are any), so it can collect
                                                ! the arrays from other processors and put it in its
                                                ! array. Data then points to either the entire array
                                                ! (if the other processor also had an extra
                                                ! calculation), or just the first proc_Nqs entries
                                                ! (which is guaranteed to be at most 1 less than the
                                                ! proc_Nqs for processor 0.




  if (ionode) then

     ! -----------------------------------------------------------------
     ! Open the file for writing.  The file is written in binary to save
     ! space.

     open(UNIT=21, FILE='vdW_kernel_table', status='replace', form='formatted', action='write')


     ! -----------------------------------------------------------------
     ! Write the relevant header information that will be read in by the
     ! kernel_table module.

     write(21, '(2i5,f13.8)') Nqs, Nr_points
     write(21, double_format) r_max
     write(21, double_format) q_mesh


     ! -----------------------------------------------------------------
     ! Processor 0 writes its kernel functions first.  The subroutine
     ! "write_data" is defined below.

     data => phi(:,:)
     call write_data(21, data)

  end if


  ! --------------------------------------------------------------------
  ! Now, loop over all other processors (if any) and collect their
  ! kernel functions in the phi array of processor 0, which is big
  ! enough to hold any of them. Figure out how many functions should
  ! have been passed and make data point to just the right amount of the
  ! phi array. Then write the data.

  do proc_i = 1, nproc-1

     call mp_get(phi, phi, mpime, 0, proc_i, 0, my_comm)

     if (ionode) then

        proc_Nqs = proc_indices(proc_i+1, 2) - proc_indices(proc_i+1,1) + 1
        data => phi(:,1:proc_Nqs)
        call write_data(21, data)

     end if

  end do


  ! --------------------------------------------------------------------
  ! Here, we basically repeat the process exactly but for the second
  ! derivatives d2phi_dk2 instead of the kernel itself.

  if (ionode) then

     data => d2phi_dk2(:,:)
     call write_data(21, data)

  end if


  do proc_i = 1, nproc-1

     call mp_get(d2phi_dk2, d2phi_dk2, mpime, 0, proc_i, 0, my_comm)

     if (mpime == 0) then

        proc_Nqs = proc_indices(proc_i+1,2) - proc_indices(proc_i+1,1) + 1

        data => d2phi_dk2(:, 1:proc_Nqs)
        call write_data(21, data)

     end if

  end do


  if (ionode) close(21)

  end subroutine write_kernel_table_file








  ! ####################################################################
  !                           |              |
  !                           |  WRITE_DATA  |
  !                           !______________|
  !
  ! Write matrix data held in the point "array" to the file with unit
  ! number "file". Data is written in binary format.

  subroutine write_data(file, array)

  real(dp), pointer:: array(:,:)     ! Input pointer to the matrix data to be written.

  integer, intent(in) :: file        ! Unit number of file to write to.

  integer :: idx, ios                ! Indexing variable.




  do idx = 1, size(array,2)
     write (file, double_format, err=100, iostat=ios) array(:,idx)
  end do

  100 call errore ('generate_vdW_kernel_table', 'Writing table file', abs (ios) )

  end subroutine write_data








  ! ####################################################################
  !                            |              |
  !                            |  RADIAL_FFT  |
  !                            |______________|
  !
  ! This subroutine performs a radial Fourier transform on the
  ! real-space kernel functions. Basically, this is just
  ! int(4*pi*r^2*phi*sin(k*r)/(k*r))dr integrated from 0 to r_max. That
  ! is, it is the kernel function phi integrated with the 0^th spherical
  ! Bessel function radially, with a 4*pi assumed from angular
  ! integration since we have spherical symmetry. The spherical symmetry
  ! comes in because the kernel function depends only on the magnitude
  ! of the vector between two points. The integration is done using the
  ! trapezoid rule.


  subroutine radial_fft(phi)

  real(dp), intent(inout) :: phi(0:Nr_points)   ! On input holds the real-space function
                                                ! phi_q1_q2(r). On output hold the
                                                ! reciprocal-space function phi_q1_q2(k).

  real(dp) :: phi_k(0:Nr_points)                ! Temporary storage for phi_q1_q2(k).

  real(dp) :: dr = r_max/Nr_points              ! Spacing between real-space sample points.

  real(dp) :: dk = 2.0D0*pi/r_max               ! Spacing between reciprocal space sample points.

  integer :: k_i, r_i                           ! Indexing variables.

  real(dp) :: r, k                              ! The real and reciprocal space points.


  phi_k = 0.0D0


  ! --------------------------------------------------------------------
  ! Handle the k=0 point separately.

  do r_i = 1, Nr_points

     r        = r_i * dr
     phi_k(0) = phi_k(0) + phi(r_i)*r**2

  end do


  ! --------------------------------------------------------------------
  ! Subtract half of the last value of because of the trapezoid rule.

  phi_k(0) = phi_k(0) - 0.5D0 * (Nr_points*dr)**2 * phi(Nr_points)


  ! --------------------------------------------------------------------
  ! Integration for the rest of the k-points.

  do k_i = 1, Nr_points

     k = k_i * dk

     do r_i = 1, Nr_points

        r          = r_i * dr
        phi_k(k_i) = phi_k(k_i) + phi(r_i) * r * sin(k*r) / k

     end do

     phi_k(Nr_points) = phi_k(Nr_points) - 0.5D0 * phi(Nr_points) * r *sin(k*r) / k

  end do

  ! --------------------------------------------------------------------
  ! Add in the 4*pi and the dr factor for the integration.

  phi = 4.0D0 * pi * phi_k * dr

  end subroutine radial_fft


end subroutine generate_kernel
